Udacity Exercises/Assignment6.py

   1 # coding: utf-8
   2
   3 # https://raw.githubusercontent.com/hankcs/udacity-deep-learning/master/6_lstm.py
   4
   5 # Deep Learning
   6 # =============
   7 #
   8 # Assignment 6
   9 # ------------
  10 #
  11 # After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.
  12
  13 # In[ ]:
  14
  15 # These are all the modules we'll be using later. Make sure you can import them
  16 # before proceeding further.
  17 from __future__ import print_function
  18 import os
  19 import numpy as np
  20 import random
  21 import string
  22 import tensorflow as tf
  23 import zipfile
  24 from six.moves import range
  25 from six.moves.urllib.request import urlretrieve
  26
  27 # In[ ]:
  28
  29 url = 'http://mattmahoney.net/dc/'
  30
  31
  32 def maybe_download(filename, expected_bytes):
  33     """Download a file if not present, and make sure it's the right size."""
  34     if not os.path.exists(filename):
  35         filename, _ = urlretrieve(url + filename, filename)
  36     statinfo = os.stat(filename)
  37     if statinfo.st_size == expected_bytes:
  38         print('Found and verified %s' % filename)
  39     else:
  40         print(statinfo.st_size)
  41         raise Exception(
  42             'Failed to verify ' + filename + '. Can you get to it with a browser?')
  43     return filename
  44
  45
  46 filename = maybe_download('text8.zip', 31344016)
  47
  48
  49 # In[ ]:
  50
  51 def read_data(filename):
  52     f = zipfile.ZipFile(filename)
  53     for name in f.namelist():
  54         return tf.compat.as_str(f.read(name))
  55     f.close()
  56
  57
  58 text = read_data(filename)
  59 print('Data size %d' % len(text))
  60
  61 # Create a small validation set.
  62
  63 # In[ ]:
  64
  65 valid_size = 1000
  66 valid_text = text[:valid_size]
  67 train_text = text[valid_size:]
  68 train_size = len(train_text)
  69 print(train_size, train_text[:64])
  70 print(valid_size, valid_text[:64])
  71
  72 # Utility functions to map characters to vocabulary IDs and back.
  73
  74 # In[ ]:
  75
  76 vocabulary_size = len(string.ascii_lowercase) + 1  # [a-z] + ' '
  77 first_letter = ord(string.ascii_lowercase[0])
  78
  79
  80 def char2id(char):
  81     if char in string.ascii_lowercase:
  82         return ord(char) - first_letter + 1
  83     elif char == ' ':
  84         return 0
  85     else:
  86         print('Unexpected character: %s' % char)
  87         return 0
  88
  89
  90 def id2char(dictid):
  91     if dictid > 0:
  92         return chr(dictid + first_letter - 1)
  93     else:
  94         return ' '
  95
  96
  97 print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
  98 print(id2char(1), id2char(26), id2char(0))
  99
 100 # Function to generate a training batch for the LSTM model.
 101
 102 # In[ ]:
 103
 104 batch_size = 64
 105 num_unrollings = 10
 106
 107
 108 class BatchGenerator(object):
 109     def __init__(self, text, batch_size, num_unrollings):
 110         self._text = text
 111         self._text_size = len(text)
 112         self._batch_size = batch_size
 113         self._num_unrollings = num_unrollings
 114         segment = self._text_size // batch_size
 115         self._cursor = [offset * segment for offset in range(batch_size)]
 116         self._last_batch = self._next_batch()
 117
 118     def _next_batch(self):
 119         """Generate a single batch from the current cursor position in the data."""
 120         batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
 121         for b in range(self._batch_size):
 122             batch[b, char2id(self._text[self._cursor[b]])] = 1.0
 123             self._cursor[b] = (self._cursor[b] + 1) % self._text_size
 124         return batch
 125
 126     def next(self):
 127         """Generate the next array of batches from the data. The array consists of
 128         the last batch of the previous array, followed by num_unrollings new ones.
 129         """
 130         batches = [self._last_batch]
 131         for step in range(self._num_unrollings):
 132             batches.append(self._next_batch())
 133         self._last_batch = batches[-1]
 134         return batches
 135
 136
 137 def characters(probabilities):
 138     """Turn a 1-hot encoding or a probability distribution over the possible
 139     characters back into its (most likely) character representation."""
 140     return [id2char(c) for c in np.argmax(probabilities, 1)]
 141
 142
 143 def batches2string(batches):
 144     """Convert a sequence of batches back into their (most likely) string
 145     representation."""
 146     s = [''] * batches[0].shape[0]
 147     for b in batches:
 148         s = [''.join(x) for x in zip(s, characters(b))]
 149     return s
 150
 151
 152 train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
 153 valid_batches = BatchGenerator(valid_text, 1, 1)
 154
 155 print(batches2string(train_batches.next()))
 156 print(batches2string(train_batches.next()))
 157 print(batches2string(valid_batches.next()))
 158 print(batches2string(valid_batches.next()))
 159
 160
 161 # In[ ]:
 162
 163 def logprob(predictions, labels):
 164     """Log-probability of the true labels in a predicted batch."""
 165     predictions[predictions < 1e-10] = 1e-10
 166     return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]
 167
 168
 169 def sample_distribution(distribution):
 170     """Sample one element from a distribution assumed to be an array of normalized
 171     probabilities.
 172     """
 173     r = random.uniform(0, 1)
 174     s = 0
 175     for i in range(len(distribution)):
 176         s += distribution[i]
 177         if s >= r:
 178             return i
 179     return len(distribution) - 1
 180
 181
 182 def sample(prediction):
 183     """Turn a (column) prediction into 1-hot encoded samples."""
 184     p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
 185     p[0, sample_distribution(prediction[0])] = 1.0
 186     return p
 187
 188
 189 def random_distribution():
 190     """Generate a random column of probabilities."""
 191     b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
 192     return b / np.sum(b, 1)[:, None]
 193
 194
 195 # Simple LSTM Model.
 196
 197 # In[ ]:
 198
 199 num_nodes = 64
 200
 201 graph = tf.Graph()
 202 with graph.as_default():
 203     # Parameters:
 204     # Input gate: input, previous output, and bias.
 205     ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
 206     im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
 207     ib = tf.Variable(tf.zeros([1, num_nodes]))
 208     # Forget gate: input, previous output, and bias.
 209     fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
 210     fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
 211     fb = tf.Variable(tf.zeros([1, num_nodes]))
 212     # Memory cell: input, state and bias.
 213     cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
 214     cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
 215     cb = tf.Variable(tf.zeros([1, num_nodes]))
 216     # Output gate: input, previous output, and bias.
 217     ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
 218     om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
 219     ob = tf.Variable(tf.zeros([1, num_nodes]))
 220     # Variables saving state across unrollings.
 221     saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
 222     saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
 223     # Classifier weights and biases.
 224     w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
 225     b = tf.Variable(tf.zeros([vocabulary_size]))
 226
 227
 228     # Definition of the cell computation.
 229     def lstm_cell(i, o, state):
 230         """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
 231         Note that in this formulation, we omit the various connections between the
 232         previous state and the gates."""
 233         input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
 234         forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
 235         update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
 236         state = forget_gate * state + input_gate * tf.tanh(update)
 237         output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
 238         return output_gate * tf.tanh(state), state
 239
 240
 241     # Input data.
 242     train_data = list()
 243     for _ in range(num_unrollings + 1):
 244         train_data.append(
 245             tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
 246     train_inputs = train_data[:num_unrollings]
 247     train_labels = train_data[1:]  # labels are inputs shifted by one time step.
 248
 249     # Unrolled LSTM loop.
 250     outputs = list()
 251     output = saved_output
 252     state = saved_state
 253     for i in train_inputs:
 254         output, state = lstm_cell(i, output, state)
 255         outputs.append(output)
 256
 257     # State saving across unrollings.
 258     with tf.control_dependencies([saved_output.assign(output),
 259                                   saved_state.assign(state)]):
 260         # Classifier.
 261         logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
 262         loss = tf.reduce_mean(
 263             tf.nn.softmax_cross_entropy_with_logits(
 264                 logits, tf.concat(0, train_labels)))
 265
 266     # Optimizer.
 267     global_step = tf.Variable(0)
 268     learning_rate = tf.train.exponential_decay(
 269         10.0, global_step, 5000, 0.1, staircase=True)
 270     optimizer = tf.train.GradientDescentOptimizer(learning_rate)
 271     gradients, v = zip(*optimizer.compute_gradients(loss))
 272     gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
 273     optimizer = optimizer.apply_gradients(
 274         zip(gradients, v), global_step=global_step)
 275
 276     # Predictions.
 277     train_prediction = tf.nn.softmax(logits)
 278
 279     # Sampling and validation eval: batch 1, no unrolling.
 280     sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
 281     saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
 282     saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
 283     reset_sample_state = tf.group(
 284         saved_sample_output.assign(tf.zeros([1, num_nodes])),
 285         saved_sample_state.assign(tf.zeros([1, num_nodes])))
 286     sample_output, sample_state = lstm_cell(
 287         sample_input, saved_sample_output, saved_sample_state)
 288     with tf.control_dependencies([saved_sample_output.assign(sample_output),
 289                                   saved_sample_state.assign(sample_state)]):
 290         sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
 291
 292 # In[ ]:
 293
 294 num_steps = 7001
 295 summary_frequency = 100
 296
 297 with tf.Session(graph=graph) as session:
 298     tf.initialize_all_variables().run()
 299     print('Initialized')
 300     mean_loss = 0
 301     for step in range(num_steps):
 302         batches = train_batches.next()
 303         feed_dict = dict()
 304         for i in range(num_unrollings + 1):
 305             feed_dict[train_data[i]] = batches[i]
 306         _, l, predictions, lr = session.run(
 307             [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
 308         mean_loss += l
 309         if step % summary_frequency == 0:
 310             if step > 0:
 311                 mean_loss = mean_loss / summary_frequency
 312             # The mean loss is an estimate of the loss over the last few batches.
 313             print(
 314                 'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
 315             mean_loss = 0
 316             labels = np.concatenate(list(batches)[1:])
 317             print('Minibatch perplexity: %.2f' % float(
 318                 np.exp(logprob(predictions, labels))))
 319             if step % (summary_frequency * 10) == 0:
 320                 # Generate some samples.
 321                 print('=' * 80)
 322                 for _ in range(5):
 323                     feed = sample(random_distribution())
 324                     sentence = characters(feed)[0]
 325                     reset_sample_state.run()
 326                     for _ in range(79):
 327                         prediction = sample_prediction.eval({sample_input: feed})
 328                         feed = sample(prediction)
 329                         sentence += characters(feed)[0]
 330                     print(sentence)
 331                 print('=' * 80)
 332             # Measure validation set perplexity.
 333             reset_sample_state.run()
 334             valid_logprob = 0
 335             for _ in range(valid_size):
 336                 b = valid_batches.next()
 337                 predictions = sample_prediction.eval({sample_input: b[0]})
 338                 valid_logprob = valid_logprob + logprob(predictions, b[1])
 339             print('Validation set perplexity: %.2f' % float(np.exp(
 340                 valid_logprob / valid_size)))
 341
 342 # ---
 343 # Problem 1
 344 # ---------
 345 #
 346 # You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.
 347 #
 348 # ---
 349 num_nodes = 64
 350
 351 graph = tf.Graph()
 352 with graph.as_default():
 353     # Parameters:
 354     # Input gate: input, previous output, and bias.
 355     ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
 356     im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
 357     ib = tf.Variable(tf.zeros([1, num_nodes]))
 358     # Forget gate: input, previous output, and bias.
 359     fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
 360     fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
 361     fb = tf.Variable(tf.zeros([1, num_nodes]))
 362     # Memory cell: input, state and bias.
 363     cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
 364     cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
 365     cb = tf.Variable(tf.zeros([1, num_nodes]))
 366     # Output gate: input, previous output, and bias.
 367     ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
 368     om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
 369     ob = tf.Variable(tf.zeros([1, num_nodes]))
 370     # Concatenate parameters
 371     sx = tf.concat(1, [ix, fx, cx, ox])
 372     sm = tf.concat(1, [im, fm, cm, om])
 373     sb = tf.concat(1, [ib, fb, cb, ob])
 374     # Variables saving state across unrollings.
 375     saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
 376     saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
 377     # Classifier weights and biases.
 378     w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
 379     b = tf.Variable(tf.zeros([vocabulary_size]))
 380
 381
 382     # Definition of the cell computation.
 383     def lstm_cell(i, o, state):
 384         """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
 385         Note that in this formulation, we omit the various connections between the
 386         previous state and the gates."""
 387         y = tf.matmul(i, sx) + tf.matmul(o, sm) + sb
 388         y_input, y_forget, update, y_output = tf.split(1, 4, y)
 389         input_gate = tf.sigmoid(y_input)
 390         forget_gate = tf.sigmoid(y_forget)
 391         output_gate = tf.sigmoid(y_output)
 392         state = forget_gate * state + input_gate * tf.tanh(update)
 393         return output_gate * tf.tanh(state), state
 394
 395
 396     # Input data.
 397     train_data = list()
 398     for _ in range(num_unrollings + 1):
 399         train_data.append(
 400             tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
 401     train_inputs = train_data[:num_unrollings]
 402     train_labels = train_data[1:]  # labels are inputs shifted by one time step.
 403
 404     # Unrolled LSTM loop.
 405     outputs = list()
 406     output = saved_output
 407     state = saved_state
 408     for i in train_inputs:
 409         output, state = lstm_cell(i, output, state)
 410         outputs.append(output)
 411
 412     # State saving across unrollings.
 413     with tf.control_dependencies([saved_output.assign(output),
 414                                   saved_state.assign(state)]):
 415         # Classifier.
 416         logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
 417         loss = tf.reduce_mean(
 418             tf.nn.softmax_cross_entropy_with_logits(
 419                 logits, tf.concat(0, train_labels)))
 420
 421     # Optimizer.
 422     global_step = tf.Variable(0)
 423     learning_rate = tf.train.exponential_decay(
 424         10.0, global_step, 5000, 0.1, staircase=True)
 425     optimizer = tf.train.GradientDescentOptimizer(learning_rate)
 426     gradients, v = zip(*optimizer.compute_gradients(loss))
 427     gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
 428     optimizer = optimizer.apply_gradients(
 429         zip(gradients, v), global_step=global_step)
 430
 431     # Predictions.
 432     train_prediction = tf.nn.softmax(logits)
 433
 434     # Sampling and validation eval: batch 1, no unrolling.
 435     sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
 436     saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
 437     saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
 438     reset_sample_state = tf.group(
 439         saved_sample_output.assign(tf.zeros([1, num_nodes])),
 440         saved_sample_state.assign(tf.zeros([1, num_nodes])))
 441     sample_output, sample_state = lstm_cell(
 442         sample_input, saved_sample_output, saved_sample_state)
 443     with tf.control_dependencies([saved_sample_output.assign(sample_output),
 444                                   saved_sample_state.assign(sample_state)]):
 445         sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
 446
 447 num_steps = 7001
 448 summary_frequency = 100
 449
 450 with tf.Session(graph=graph) as session:
 451     tf.initialize_all_variables().run()
 452     print('Initialized')
 453     mean_loss = 0
 454     for step in range(num_steps):
 455         batches = train_batches.next()
 456         feed_dict = dict()
 457         for i in range(num_unrollings + 1):
 458             feed_dict[train_data[i]] = batches[i]
 459         _, l, predictions, lr = session.run(
 460             [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
 461         mean_loss += l
 462         if step % summary_frequency == 0:
 463             if step > 0:
 464                 mean_loss = mean_loss / summary_frequency
 465             # The mean loss is an estimate of the loss over the last few batches.
 466             print(
 467                 'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
 468             mean_loss = 0
 469             labels = np.concatenate(list(batches)[1:])
 470             print('Minibatch perplexity: %.2f' % float(
 471                 np.exp(logprob(predictions, labels))))
 472             if step % (summary_frequency * 10) == 0:
 473                 # Generate some samples.
 474                 print('=' * 80)
 475                 for _ in range(5):
 476                     feed = sample(random_distribution())
 477                     sentence = characters(feed)[0]
 478                     reset_sample_state.run()
 479                     for _ in range(79):
 480                         prediction = sample_prediction.eval({sample_input: feed})
 481                         feed = sample(prediction)
 482                         sentence += characters(feed)[0]
 483                     print(sentence)
 484                 print('=' * 80)
 485             # Measure validation set perplexity.
 486             reset_sample_state.run()
 487             valid_logprob = 0
 488             for _ in range(valid_size):
 489                 b = valid_batches.next()
 490                 predictions = sample_prediction.eval({sample_input: b[0]})
 491                 valid_logprob = valid_logprob + logprob(predictions, b[1])
 492             print('Validation set perplexity: %.2f' % float(np.exp(
 493                 valid_logprob / valid_size)))
 494 # ---
 495 # Problem 2
 496 # ---------
 497 #
 498 # We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.
 499 #
 500 # a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.
 501 #
 502 # b- Write a bigram-based LSTM, modeled on the character LSTM above.
 503 #
 504 # c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).
 505 #
 506 # ---
 507 bigram_vocabulary_size = vocabulary_size * vocabulary_size
 508
 509
 510 class BigramBatchGenerator(object):
 511     def __init__(self, text, batch_size, num_unrollings):
 512         self._text = text
 513         self._text_size_in_chars = len(text)
 514         self._text_size = self._text_size_in_chars // 2
 515         self._batch_size = batch_size
 516         self._num_unrollings = num_unrollings
 517         segment = self._text_size // batch_size
 518         self._cursor = [offset * segment for offset in range(batch_size)]
 519         self._last_batch = self._next_batch()
 520
 521     def _next_batch(self):
 522         batch = np.zeros(shape=self._batch_size, dtype=np.int)
 523         for b in range(self._batch_size):
 524             char_idx = self._cursor[b] * 2
 525             ch1 = char2id(self._text[char_idx])
 526             if self._text_size_in_chars - 1 == char_idx:
 527                 ch2 = 0
 528             else:
 529                 ch2 = char2id(self._text[char_idx + 1])
 530             batch[b] = ch1 * vocabulary_size + ch2
 531             self._cursor[b] = (self._cursor[b] + 1) % self._text_size
 532         return batch
 533
 534     def next(self):
 535         batches = [self._last_batch]
 536         for step in range(self._num_unrollings):
 537             batches.append(self._next_batch())
 538         self._last_batch = batches[-1]
 539         return batches
 540
 541
 542 def bi2str(encoding):
 543     return id2char(encoding // vocabulary_size) + id2char(encoding % vocabulary_size)
 544
 545
 546 def bigrams(encodings):
 547     return [bi2str(e) for e in encodings]
 548
 549
 550 def bibatches2string(batches):
 551     s = [''] * batches[0].shape[0]
 552     for b in batches:
 553         s = [''.join(x) for x in zip(s, bigrams(b))]
 554     return s
 555
 556
 557 bi_onehot = np.zeros((bigram_vocabulary_size, bigram_vocabulary_size))
 558 np.fill_diagonal(bi_onehot, 1)
 559
 560
 561 def bi_one_hot(encodings):
 562     return [bi_onehot[e] for e in encodings]
 563
 564
 565 train_batches = BigramBatchGenerator(train_text, 8, 8)
 566 valid_batches = BigramBatchGenerator(valid_text, 1, 1)
 567
 568 print(bibatches2string(train_batches.next()))
 569 print(bibatches2string(train_batches.next()))
 570 print(bibatches2string(valid_batches.next()))
 571 print(bibatches2string(valid_batches.next()))
 572
 573
 574 def logprob(predictions, labels):
 575     """Log-probability of the true labels in a predicted batch."""
 576     predictions[predictions < 1e-10] = 1e-10
 577     return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]
 578
 579
 580 def sample_distribution(distribution):
 581     """Sample one element from a distribution assumed to be an array of normalized
 582     probabilities.
 583     """
 584     r = random.uniform(0, 1)
 585     s = 0
 586     for i in range(len(distribution)):
 587         s += distribution[i]
 588         if s >= r:
 589             return i
 590     return len(distribution) - 1
 591
 592
 593 def sample(prediction, size=vocabulary_size):
 594     """Turn a (column) prediction into 1-hot encoded samples."""
 595     p = np.zeros(shape=[1, size], dtype=np.float)
 596     p[0, sample_distribution(prediction[0])] = 1.0
 597     return p
 598
 599
 600 def one_hot_voc(prediction, size=vocabulary_size):
 601     p = np.zeros(shape=[1, size], dtype=np.float)
 602     p[0, prediction[0]] = 1.0
 603     return p
 604
 605
 606 def random_distribution(size=vocabulary_size):
 607     """Generate a random column of probabilities."""
 608     b = np.random.uniform(0.0, 1.0, size=[1, size])
 609     return b / np.sum(b, 1)[:, None]
 610
 611
 612 num_nodes = 512
 613 num_unrollings = 10
 614 batch_size = 32
 615 embedding_size = 128
 616 graph = tf.Graph()
 617 with graph.as_default():
 618     # input to all gates
 619     x = tf.Variable(tf.truncated_normal([embedding_size, num_nodes * 4], -0.1, 0.1), name='x')
 620     # memory of all gates
 621     m = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1), name='m')
 622     # biases all gates
 623     biases = tf.Variable(tf.zeros([1, num_nodes * 4]))
 624     # Variables saving state across unrollings.
 625     saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
 626     saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
 627     # Classifier weights and biases.
 628     w = tf.Variable(tf.truncated_normal([num_nodes, bigram_vocabulary_size], -0.1, 0.1))
 629     b = tf.Variable(tf.zeros([bigram_vocabulary_size]))
 630     # embeddings for all possible bigrams
 631     embeddings = tf.Variable(tf.random_uniform([bigram_vocabulary_size, embedding_size], -1.0, 1.0))
 632     # one hot encoding for labels in
 633     np_one_hot = np.zeros((bigram_vocabulary_size, bigram_vocabulary_size))
 634     np.fill_diagonal(np_one_hot, 1)
 635     bigram_one_hot = tf.constant(np.reshape(np_one_hot, -1), dtype=tf.float32,
 636                                  shape=[bigram_vocabulary_size, bigram_vocabulary_size])
 637     keep_prob = tf.placeholder(tf.float32)
 638
 639
 640     # Definition of the cell computation.
 641     def lstm_cell(i, o, state):
 642         i = tf.nn.dropout(i, keep_prob)
 643         mult = tf.matmul(i, x) + tf.matmul(o, m) + biases
 644         input_gate = tf.sigmoid(mult[:, :num_nodes])
 645         forget_gate = tf.sigmoid(mult[:, num_nodes:num_nodes * 2])
 646         update = mult[:, num_nodes * 3:num_nodes * 4]
 647         state = forget_gate * state + input_gate * tf.tanh(update)
 648         output_gate = tf.sigmoid(mult[:, num_nodes * 3:])
 649         output = tf.nn.dropout(output_gate * tf.tanh(state), keep_prob)
 650         return output, state
 651
 652
 653     # Input data. [num_unrollings, batch_size] -> one hot encoding removed, we send just bigram ids
 654     tf_train_data = tf.placeholder(tf.int32, shape=[num_unrollings + 1, batch_size])
 655     train_data = list()
 656     for i in tf.split(0, num_unrollings + 1, tf_train_data):
 657         train_data.append(tf.squeeze(i))
 658     train_inputs = train_data[:num_unrollings]
 659     train_labels = list()
 660     for l in train_data[1:]:
 661         train_labels.append(tf.gather(bigram_one_hot, l))
 662
 663     # Unrolled LSTM loop.
 664     outputs = list()
 665     output = saved_output
 666     state = saved_state
 667     # python loop used: tensorflow does not support sequential operations yet
 668     for i in train_inputs:  # having a loop simulates having time
 669         # embed input bigrams -> [batch_size, embedding_size]
 670         output, state = lstm_cell(tf.nn.embedding_lookup(embeddings, i), output, state)
 671         outputs.append(output)
 672
 673     # State saving across unrollings, control_dependencies makes sure that output and state are computed
 674     with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
 675         logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
 676         loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits,
 677                                                                       tf.concat(0, train_labels)
 678                                                                       ))
 679     # Optimizer.
 680     global_step = tf.Variable(0)
 681     learning_rate = tf.train.exponential_decay(10.0, global_step, 500, 0.9, staircase=True)
 682     optimizer = tf.train.GradientDescentOptimizer(learning_rate)
 683     gradients, v = zip(*optimizer.compute_gradients(loss))
 684     gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
 685     optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
 686
 687     # here we predict the embedding
 688     # train_prediction = tf.argmax(tf.nn.softmax(logits), 1, name='train_prediction')
 689     train_prediction = tf.nn.softmax(logits)
 690
 691     # Sampling and validation eval: batch 1, no unrolling.
 692     sample_input = tf.placeholder(tf.int32, shape=[1])
 693     saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
 694     saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
 695     reset_sample_state = tf.group(saved_sample_output.assign(tf.zeros([1, num_nodes])),
 696                                   saved_sample_state.assign(tf.zeros([1, num_nodes])))
 697     embed_sample_input = tf.nn.embedding_lookup(embeddings, sample_input)
 698     sample_output, sample_state = lstm_cell(embed_sample_input, saved_sample_output, saved_sample_state)
 699
 700     with tf.control_dependencies([saved_sample_output.assign(sample_output),
 701                                   saved_sample_state.assign(sample_state)]):
 702         sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
 703
 704 num_steps = 4001
 705 summary_frequency = 100
 706 # initalize batch generators
 707
 708 with tf.Session(graph=graph) as session:
 709     tf.initialize_all_variables().run()
 710     print('Initialized')
 711     train_batches = BigramBatchGenerator(train_text, batch_size, num_unrollings)
 712     valid_batches = BigramBatchGenerator(valid_text, 1, 1)
 713     mean_loss = 0
 714     for step in range(num_steps):
 715         batches = train_batches.next()
 716         _, l, lr, predictions = session.run([optimizer, loss, learning_rate, train_prediction],
 717                                             feed_dict={tf_train_data: batches, keep_prob: 0.6})
 718         mean_loss += l
 719         if step % summary_frequency == 0:
 720             if step > 0:
 721                 mean_loss = mean_loss / summary_frequency
 722             # The mean loss is an estimate of the loss over the last few batches.
 723             print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
 724             mean_loss = 0
 725             labels = list(batches)[1:]
 726             labels = np.concatenate([bi_one_hot(l) for l in labels])
 727             print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels))))
 728             if step % (summary_frequency * 10) == 0:
 729                 # Generate some samples.
 730                 print('=' * 80)
 731                 for _ in range(5):
 732                     feed = np.argmax(sample(random_distribution(bigram_vocabulary_size), bigram_vocabulary_size))
 733                     sentence = bi2str(feed)
 734                     reset_sample_state.run()
 735                     for _ in range(49):
 736                         prediction = sample_prediction.eval({sample_input: [feed], keep_prob: 1.0})
 737                         feed = np.argmax(sample(prediction, bigram_vocabulary_size))
 738                         sentence += bi2str(feed)
 739                     print(sentence)
 740                 print('=' * 80)
 741             # Measure validation set perplexity.
 742             reset_sample_state.run()
 743             valid_logprob = 0
 744             for _ in range(valid_size):
 745                 b = valid_batches.next()
 746                 predictions = sample_prediction.eval({sample_input: b[0], keep_prob: 1.0})
 747                 # print(predictions)
 748                 valid_logprob = valid_logprob + logprob(predictions, one_hot_voc(b[1], bigram_vocabulary_size))
 749             print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))
 750
 751 # ---
 752 # Problem 3
 753 # ---------
 754 #
 755 # (difficult!)
 756 #
 757 # Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:
 758 #
 759 #     the quick brown fox
 760 #
 761 # the model should attempt to output:
 762 #
 763 #     eht kciuq nworb xof
 764 #
 765 # Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.
 766 #
 767 # ---
 768 from tensorflow.models.rnn.translate import seq2seq_model
 769 import math
 770
 771 batch_size = 64
 772 num_unrollings = 19
 773
 774
 775 class Seq2SeqBatchGenerator(object):
 776     def __init__(self, text, batch_size, num_unrollings):
 777         self._text = text
 778         self._text_size = len(text)
 779         self._batch_size = batch_size
 780         self._num_unrollings = num_unrollings
 781         segment = self._text_size // num_unrollings
 782         self._cursor = [offset * segment for offset in range(batch_size)]
 783         self._last_batch = self._next_batch(0)
 784
 785     def _next_batch(self, step):
 786         """Generate a single batch from the current cursor position in the data."""
 787         batch = ''
 788         # print('text size', self._text_size)
 789         for b in range(self._num_unrollings):
 790             # print(self._cursor[step])
 791             self._cursor[step] %= self._text_size
 792             batch += self._text[self._cursor[step]]
 793             self._cursor[step] += 1
 794         return batch
 795
 796     def next(self):
 797         """Generate the next array of batches from the data. The array consists of
 798         the last batch of the previous array, followed by num_unrollings new ones.
 799         """
 800         batches = [self._last_batch]
 801         for step in range(self._batch_size):
 802             batches.append(self._next_batch(step))
 803         self._last_batch = batches[-1]
 804         return batches
 805
 806
 807 def characters(probabilities):
 808     """Turn a 1-hot encoding or a probability distribution over the possible
 809     characters back into its (most likely) character representation."""
 810     return [id2char(c) for c in np.argmax(probabilities, 1)]
 811
 812
 813 def ids(probabilities):
 814     """Turn a 1-hot encoding or a probability distribution over the possible
 815     characters back into its (most likely) character representation."""
 816     return [str(c) for c in np.argmax(probabilities, 1)]
 817
 818
 819 def batches2id(batches):
 820     """Convert a sequence of batches back into their (most likely) string
 821     representation."""
 822     s = [''] * batches[0].shape[0]
 823     for b in batches:
 824         s = [''.join(x) for x in zip(s, ids(b))]
 825     return s
 826
 827
 828 train_batches = Seq2SeqBatchGenerator(train_text, batch_size, num_unrollings)
 829 valid_batches = Seq2SeqBatchGenerator(valid_text, 1, num_unrollings)
 830
 831
 832 def rev_id(forward):
 833     temp = forward.split(' ')
 834     backward = []
 835     for i in range(len(temp)):
 836         backward += temp[i][::-1] + ' '
 837     return list(map(lambda x: char2id(x), backward[:-1]))
 838
 839
 840 batches = train_batches.next()
 841 train_sets = []
 842 batch_encs = list(map(lambda x: list(map(lambda y: char2id(y), list(x))), batches))
 843 batch_decs = list(map(lambda x: rev_id(x), batches))
 844 print('x=', ''.join([id2char(x) for x in batch_encs[0]]))
 845 print('y=', ''.join([id2char(x) for x in batch_decs[0]]))
 846
 847
 848 def create_model(forward_only):
 849     model = seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_size,
 850                                        target_vocab_size=vocabulary_size,
 851                                        buckets=[(20, 20)],
 852                                        size=256,
 853                                        num_layers=4,
 854                                        max_gradient_norm=5.0,
 855                                        batch_size=batch_size,
 856                                        learning_rate=1.0,
 857                                        learning_rate_decay_factor=0.9,
 858                                        use_lstm=True,
 859                                        forward_only=forward_only)
 860     return model
 861
 862
 863 with tf.Session() as sess:
 864     model = create_model(False)
 865     sess.run(tf.initialize_all_variables())
 866     num_steps = 30001
 867
 868     # This is the training loop.
 869     step_time, loss = 0.0, 0.0
 870     current_step = 0
 871     previous_losses = []
 872     step_ckpt = 100
 873     valid_ckpt = 500
 874
 875     for step in range(1, num_steps):
 876         model.batch_size = batch_size
 877         batches = train_batches.next()
 878         train_sets = []
 879         batch_encs = list(map(lambda x: list(map(lambda y: char2id(y), list(x))), batches))
 880         batch_decs = list(map(lambda x: rev_id(x), batches))
 881         for i in range(len(batch_encs)):
 882             train_sets.append((batch_encs[i], batch_decs[i]))
 883
 884         # Get a batch and make a step.
 885         encoder_inputs, decoder_inputs, target_weights = model.get_batch([train_sets], 0)
 886         _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, False)
 887
 888         loss += step_loss / step_ckpt
 889
 890         # Once in a while, we save checkpoint, print statistics, and run evals.
 891         if step % step_ckpt == 0:
 892             # Print statistics for the previous epoch.
 893             perplexity = math.exp(loss) if loss < 300 else float('inf')
 894             print("global step %d learning rate %.4f perplexity "
 895                   "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), perplexity))
 896             # Decrease learning rate if no improvement was seen over last 3 times.
 897             if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
 898                 sess.run(model.learning_rate_decay_op)
 899             previous_losses.append(loss)
 900
 901             loss = 0.0
 902
 903             if step % valid_ckpt == 0:
 904                 v_loss = 0.0
 905
 906                 model.batch_size = 1
 907                 batches = ['the quick brown fox']
 908                 test_sets = []
 909                 batch_encs = list(map(lambda x: list(map(lambda y: char2id(y), list(x))), batches))
 910                 # batch_decs = map(lambda x: rev_id(x), batches)
 911                 test_sets.append((batch_encs[0], []))
 912                 # Get a 1-element batch to feed the sentence to the model.
 913                 encoder_inputs, decoder_inputs, target_weights = model.get_batch([test_sets], 0)
 914                 # Get output logits for the sentence.
 915                 _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, True)
 916
 917                 # This is a greedy decoder - outputs are just argmaxes of output_logits.
 918                 outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
 919
 920                 print('>>>>>>>>> ', batches[0], ' -> ', ''.join(map(lambda x: id2char(x), outputs)))
 921
 922                 for _ in range(valid_size):
 923                     model.batch_size = 1
 924                     v_batches = valid_batches.next()
 925                     valid_sets = []
 926                     v_batch_encs = list(map(lambda x: list(map(lambda y: char2id(y), list(x))), v_batches))
 927                     v_batch_decs = list(map(lambda x: rev_id(x), v_batches))
 928                     for i in range(len(v_batch_encs)):
 929                         valid_sets.append((v_batch_encs[i], v_batch_decs[i]))
 930                     encoder_inputs, decoder_inputs, target_weights = model.get_batch([valid_sets], 0)
 931                     _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, True)
 932                     v_loss += eval_loss / valid_size
 933
 934                 eval_ppx = math.exp(v_loss) if v_loss < 300 else float('inf')
 935                 print("  valid eval:  perplexity %.2f" % (eval_ppx))
 936
 937     # reuse variable -> subdivide into two boxes
 938     model.batch_size = 1  # We decode one sentence at a time.
 939     batches = ['the quick brown fox']
 940     test_sets = []
 941     batch_encs = list(map(lambda x: list(map(lambda y: char2id(y), list(x))), batches))
 942     # batch_decs = map(lambda x: rev_id(x), batches)
 943     test_sets.append((batch_encs[0], []))
 944     # Get a 1-element batch to feed the sentence to the model.
 945     encoder_inputs, decoder_inputs, target_weights = model.get_batch([test_sets], 0)
 946     # Get output logits for the sentence.
 947     _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, True)
 948     # This is a greedy decoder - outputs are just argmaxes of output_logits.
 949     outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
 950     print('## : ', outputs)
 951     # If there is an EOS symbol in outputs, cut them at that point.
 952     if char2id('!') in outputs:
 953         outputs = outputs[:outputs.index(char2id('!'))]
 954
 955     print(batches[0], ' -> ', ''.join(map(lambda x: id2char(x), outputs)))